/*
 *  zonelist_order:
 *  0 = automatic detection of better ordering.
 *  1 = order by ([node] distance, -zonetype)
 *  2 = order by (-zonetype, [node] distance)
 *
 *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
 *  the same zonelist. So only NUMA can configure this param.
 */
#define ZONELIST_ORDER_DEFAULT  0
#define ZONELIST_ORDER_NODE     1
#define ZONELIST_ORDER_ZONE     2
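/*
 * Illustration (hypothetical 2-node box, mirroring the trace further
 * down): node 0 has DMA, DMA32 and Normal populated, node 1 only
 * Normal.  Node 0's fallback zonelist then comes out as:
 *
 *   node order: Normal(0), DMA32(0), DMA(0), Normal(1)
 *   zone order: Normal(0), Normal(1), DMA32(0), DMA(0)
 *
 * i.e. node order exhausts the local node first, while zone order
 * protects the scarce low zones by trying every node's higher zones
 * before falling back to DMA32/DMA.
 */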

static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;


/* The value the user specified, set via the numa_zonelist_order boot parameter or sysctl */
static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
/* string for sysctl */
#define NUMA_ZONELIST_ORDER_LEN	16
char numa_zonelist_order[NUMA_ZONELIST_ORDER_LEN] = "default";
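
/*
 * For context: a sketch of how this string becomes user_zonelist_order,
 * modeled on the upstream __parse_numa_zonelist_order() of this kernel
 * era ("d"/"default", "n"/"node", "z"/"zone").
 */
static int __parse_numa_zonelist_order(char *s)
{
	if (*s == 'd' || *s == 'D') {
		user_zonelist_order = ZONELIST_ORDER_DEFAULT;
	} else if (*s == 'n' || *s == 'N') {
		user_zonelist_order = ZONELIST_ORDER_NODE;
	} else if (*s == 'z' || *s == 'Z') {
		user_zonelist_order = ZONELIST_ORDER_ZONE;
	} else {
		printk(KERN_WARNING
			"Ignoring invalid numa_zonelist_order value:  %s\n", s);
		return -EINVAL;
	}
	return 0;
}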


/*
 * Called with zonelists_mutex held always
 * unless system_state == SYSTEM_BOOTING.
 */
void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
{
	/* On the author's system this resolves to ZONELIST_ORDER_ZONE */
	set_zonelist_order();

	if (system_state == SYSTEM_BOOTING) {
		__build_all_zonelists(NULL);
		mminit_verify_zonelist();
		cpuset_init_current_mems_allowed();
	} else {
#ifdef CONFIG_MEMORY_HOTPLUG
		if (zone)
			setup_zone_pageset(zone);
#endif
		/* we have to stop all cpus to guarantee there is no user
		   of zonelist */
		stop_machine(__build_all_zonelists, pgdat, NULL);
		/* cpuset refresh routine should be here */
	}
	vm_total_pages = nr_free_pagecache_pages();
	/*
	 * Disable grouping by mobility if the number of pages in the
	 * system is too low to allow the mechanism to work. It would be
	 * more accurate, but expensive to check per-zone. This check is
	 * made on memory-hotadd so a system can start with mobility
	 * disabled and enable it later
	 */
	if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
		page_group_by_mobility_disabled = 1;
	else
		page_group_by_mobility_disabled = 0;

	printk("Built %i zonelists in %s order, mobility grouping %s.  "
		"Total pages: %ld\n",
			nr_online_nodes,
			zonelist_order_name[current_zonelist_order],
			page_group_by_mobility_disabled ? "off" : "on",
			vm_total_pages);
#ifdef CONFIG_NUMA
	printk("Policy zone: %s\n", zone_names[policy_zone]);
#endif
}

static void set_zonelist_order(void)
{
	if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
		current_zonelist_order = default_zonelist_order();
	else
		current_zonelist_order = user_zonelist_order;
}

static int default_zonelist_order(void)
{
	int nid, zone_type;
	unsigned long low_kmem_size, total_size;
	struct zone *z;
	int average_size;
	/*
	 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
	 * If they are really small and used heavily, the system can fall
	 * into OOM very easily.
	 * This function detects the ZONE_DMA/DMA32 size and configures zone order.
	 */
	/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
	low_kmem_size = 0;
	total_size = 0;
	for_each_online_node(nid) {
		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
			z = &NODE_DATA(nid)->node_zones[zone_type];
			if (populated_zone(z)) {
				if (zone_type < ZONE_NORMAL)
					low_kmem_size += z->present_pages;
				total_size += z->present_pages;
			} else if (zone_type == ZONE_NORMAL) {
				/*
				 * If any node has only lowmem, then node order
				 * is preferred to allow kernel allocations
				 * locally; otherwise, they can easily infringe
				 * on other nodes when there is an abundance of
				 * lowmem available to allocate from.
				 */
				return ZONELIST_ORDER_NODE;
			}
		}
	}
	if (!low_kmem_size ||  /* there is no DMA area. */
	    low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
		return ZONELIST_ORDER_NODE;
	/*
	 * Look into each node's config.
	 * If there is a node whose DMA/DMA32 memory makes up a very large
	 * share of its local memory, NODE order may be suitable.
	 */
	average_size = total_size / (nodes_weight(node_states[N_MEMORY]) + 1);

	for_each_online_node(nid) {
		low_kmem_size = 0;
		total_size = 0;
		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
			z = &NODE_DATA(nid)->node_zones[zone_type];
			if (populated_zone(z)) {
				if (zone_type < ZONE_NORMAL)
					low_kmem_size += z->present_pages;
				total_size += z->present_pages;
			}
		}
		if (low_kmem_size &&
		    total_size > average_size && /* ignore small node */
		    low_kmem_size > total_size * 70/100)
			return ZONELIST_ORDER_NODE;
	}
	return ZONELIST_ORDER_ZONE;
}
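
/*
 * Worked example with made-up sizes: a 2-node, 128GB machine where only
 * node 0 carries lowmem (16MB ZONE_DMA + 4GB ZONE_DMA32) and both nodes
 * have a populated ZONE_NORMAL.  Then low_kmem_size ~= 4GB and
 * total_size ~= 128GB: low_kmem_size is neither 0 nor more than half of
 * total_size, and no node above average size is more than 70% lowmem,
 * so the function falls through to ZONELIST_ORDER_ZONE -- consistent
 * with the note in build_all_zonelists() above.
 */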

/* The int return value exists only to satisfy the stop_machine() callback signature */
static int __build_all_zonelists(void *data)
{
	int nid;
	int cpu;
	pg_data_t *self = data;

#ifdef CONFIG_NUMA
	memset(node_load, 0, sizeof(node_load));
#endif

	if (self && !node_online(self->node_id)) {
		build_zonelists(self);
		build_zonelist_cache(self);
	}

	for_each_online_node(nid) {
		pg_data_t *pgdat = NODE_DATA(nid);

		build_zonelists(pgdat);
		build_zonelist_cache(pgdat);
	}

	/*
	 * Initialize the boot_pagesets that are going to be used
	 * for bootstrapping processors. The real pagesets for
	 * each zone will be allocated later when the per cpu
	 * allocator is available.
	 *
	 * boot_pagesets are used also for bootstrapping offline
	 * cpus if the system is already booted because the pagesets
	 * are needed to initialize allocators on a specific cpu too.
	 * F.e. the percpu allocator needs the page allocator which
	 * needs the percpu allocator in order to allocate its pagesets
	 * (a chicken-egg dilemma).
	 */
	for_each_possible_cpu(cpu) {
		setup_pageset(&per_cpu(boot_pageset, cpu), 0);

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
		/*
		 * We now know the "local memory node" for each node--
		 * i.e., the node of the first zone in the generic zonelist.
		 * Set up numa_mem percpu variable for on-line cpus.  During
		 * boot, only the boot cpu should be on-line;  we'll init the
		 * secondary cpus' numa_mem as they come on-line.  During
		 * node/memory hotplug, we'll fixup all on-line cpus.
		 */
		if (cpu_online(cpu))
			set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
#endif
	}

	return 0;
}


/* Maximum number of zones on a zonelist */
#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
/*
 * The NUMA zonelists are doubled because we need zonelists that restrict the
 * allocations to a single node for __GFP_THISNODE.
 *
 * [0]	: Zonelist with fallback
 * [1]	: No fallback (__GFP_THISNODE)
 */
#define MAX_ZONELISTS 2
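
/*
 * For context: a sketch of how an allocation selects between the two
 * lists, modeled on the gfp_zonelist()/node_zonelist() helpers in
 * gfp.h of this era (exact guards differ across versions).
 */
static inline int gfp_zonelist(gfp_t flags)
{
	/* __GFP_THISNODE picks the no-fallback list [1] on NUMA builds */
	if (IS_ENABLED(CONFIG_NUMA) && unlikely(flags & __GFP_THISNODE))
		return 1;
	return 0;
}

static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
	return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
}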


/*
 * This struct contains information about a zone in a zonelist. It is stored
 * here to avoid dereferences into large structures and lookups of tables
 */
struct zoneref {
	struct zone *zone;	/* Pointer to actual zone */
	int zone_idx;		/* zone_idx(zoneref->zone) */
};

/*
 * One allocation request operates on a zonelist. A zonelist
 * is a list of zones, the first one is the 'goal' of the
 * allocation, the other zones are fallback zones, in decreasing
 * priority.
 *
 * If zlcache_ptr is not NULL, then it is just the address of zlcache.
 * If zlcache_ptr is NULL, there is no zlcache.
 *
 * To speed the reading of the zonelist, the zonerefs contain the zone index
 * of the entry being read. Helper functions to access information given
 * a struct zoneref are
 *
 * zonelist_zone()	- Return the struct zone * for an entry in _zonerefs
 * zonelist_zone_idx()	- Return the index of the zone for an entry
 * zonelist_node_idx()	- Return the index of the node for an entry
 */
struct zonelist {
	struct zonelist_cache *zlcache_ptr;		     // NULL or &zlcache
	struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
#ifdef CONFIG_NUMA
	struct zonelist_cache zlcache;			     // optional ...
#endif
};
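
/*
 * The helpers named in the comment above, approximately as defined in
 * mmzone.h of this era:
 */
static inline struct zone *zonelist_zone(struct zoneref *zoneref)
{
	return zoneref->zone;
}

static inline int zonelist_zone_idx(struct zoneref *zoneref)
{
	/* reads the cached index instead of recomputing zone_idx() */
	return zoneref->zone_idx;
}

static inline int zonelist_node_idx(struct zoneref *zoneref)
{
#ifdef CONFIG_NUMA
	/* zone_to_nid() is not available in this header context */
	return zoneref->zone->node;
#else
	return 0;
#endif
}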

/*
 * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
 * (mostly NUMA machines?) to denote a higher-level memory zone than the
 * zone denotes.
 *
 * On NUMA machines, each NUMA node would have a pg_data_t to describe
 * its memory layout.
 *
 * Memory statistics and page replacement data structures are maintained on a
 * per-zone basis.
 */
struct bootmem_data;
typedef struct pglist_data {
	struct zone node_zones[MAX_NR_ZONES];
	struct zonelist node_zonelists[MAX_ZONELISTS];
	int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
	struct page *node_mem_map;
#ifdef CONFIG_MEMCG
	struct page_cgroup *node_page_cgroup;
#endif
#endif
#ifndef CONFIG_NO_BOOTMEM
	struct bootmem_data *bdata;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
	/*
	 * Must be held any time you expect node_start_pfn, node_present_pages
	 * or node_spanned_pages to stay constant.  Holding this will also
	 * guarantee that any pfn_valid() stays that way.
	 *
	 * Nests above zone->lock and zone->span_seqlock.
	 */
	spinlock_t node_size_lock;
#endif
	unsigned long node_start_pfn;
	unsigned long node_present_pages; /* total number of physical pages */
	unsigned long node_spanned_pages; /* total size of physical page
					     range, including holes */
	int node_id;
	nodemask_t reclaim_nodes;	/* Nodes allowed to reclaim from */
	wait_queue_head_t kswapd_wait;
	wait_queue_head_t pfmemalloc_wait;
	struct task_struct *kswapd;	/* Protected by
					   mem_hotplug_begin/end() */
	int kswapd_max_order;
	enum zone_type classzone_idx;
#ifdef CONFIG_NUMA_BALANCING
	/* Lock serializing the migrate rate limiting window */
	spinlock_t numabalancing_migrate_lock;

	/* Rate limiting time interval */
	unsigned long numabalancing_migrate_next_window;

	/* Number of pages migrated during the rate limiting time interval */
	unsigned long numabalancing_migrate_nr_pages;
#endif

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
	/*
	 * If memory initialisation on large machines is deferred then this
	 * is the first PFN that needs to be initialised.
	 */
	RH_KABI_USE(1, unsigned long first_deferred_pfn)
#else
	RH_KABI_RESERVE(1)
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

#ifdef CONFIG_ZONE_DEVICE
	RH_KABI_USE(2, struct zone *zone_device)
#else
	RH_KABI_RESERVE(2)
#endif

	/* reserved for Red Hat */
	RH_KABI_RESERVE(3)
	RH_KABI_RESERVE(4)
} pg_data_t;
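
/*
 * For reference: NODE_DATA() is how a node id reaches its pg_data_t.
 * A sketch of the usual arrangement -- the NUMA side is arch-specific
 * (this is roughly the x86-64 form); !NUMA uses the single static
 * contig_page_data:
 */
#ifdef CONFIG_NUMA
extern struct pglist_data *node_data[];
#define NODE_DATA(nid)		(node_data[nid])
#else
extern struct pglist_data contig_page_data;
#define NODE_DATA(nid)		(&contig_page_data)
#endif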

static int node_order[MAX_NUMNODES];


static void build_zonelists(pg_data_t *pgdat)
{
	int j, node, load;
	enum zone_type i;
	nodemask_t used_mask;
	int local_node, prev_node;
	struct zonelist *zonelist;
	int order = current_zonelist_order;

	/* initialize zonelists */
	for (i = 0; i < MAX_ZONELISTS; i++) {
		zonelist = pgdat->node_zonelists + i;
		zonelist->_zonerefs[0].zone = NULL;
		zonelist->_zonerefs[0].zone_idx = 0;
	}

	/* NUMA-aware ordering of nodes */
	local_node = pgdat->node_id;
	load = nr_online_nodes;
	prev_node = local_node;
	nodes_clear(used_mask);

	memset(node_order, 0, sizeof(node_order));
	j = 0;

	/* Find the next best node to use as a fallback for this node */
	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
		/*
		 * We don't want to pressure a particular node.
		 * So adding penalty to the first node in same
		 * distance group to make it round-robin.
		 */
		if (node_distance(local_node, node) !=
		    node_distance(local_node, prev_node))
			node_load[node] = load;

		prev_node = node;
		load--;
		if (order == ZONELIST_ORDER_NODE)
			build_zonelists_in_node_order(pgdat, node);
		else
			node_order[j++] = node;	/* remember order */
	}

	if (order == ZONELIST_ORDER_ZONE) {
		/* calculate node order -- i.e., DMA last! */
		build_zonelists_in_zone_order(pgdat, j);
	}

	build_thisnode_zonelists(pgdat);
}
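
/*
 * Hypothetical example of the node_load penalty above: if nodes 1 and 2
 * are equidistant from the node being processed, find_next_best_node()
 * returns node 1 first and only node 1 receives the node_load penalty.
 * Since node_load feeds into find_next_best_node()'s scoring, nodes
 * that build their zonelists later will prefer node 2 among otherwise
 * equal candidates -- spreading fallback pressure round-robin instead
 * of always hammering the lowest-numbered node.
 */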


/**
Example from a machine with two nodes.

Processing node0:
kernel: build_zonelists node_id:0
There are two fallback nodes, node0 and node1:
kernel: node:0
kernel: node:1

kernel: build_zonelists_in_zone_order node id:0
kernel: zone name:DMA addr:ffff88303ffd9000 populate: 1
kernel: zone name:DMA32 addr:ffff88303ffd9800 populate: 1
kernel: zone name:Normal addr:ffff88303ffda000 populate: 1
kernel: zone name:Movable addr:ffff88303ffda800 populate: 0

kernel: j:0 node:0 zone_type:3 zone name:Movable	node0's Movable zone (no physical pages behind it)
kernel: j:1 node:1 zone_type:3 zone name:Movable	node1's Movable zone (no physical pages behind it)

kernel: j:0 node:0 zone_type:2 zone name:Normal		node0's Normal
			node0's Normal goes into node0's pgdat->node_zonelists[0]._zonerefs[0]
kernel:     j:0 zone name:Normal zone_idx:2 zone:ffff88303ffda000 pos:1

kernel: j:1 node:1 zone_type:2 zone name:Normal		node1's Normal
			node1's Normal goes into node0's pgdat->node_zonelists[0]._zonerefs[1]
kernel:	    j:1 zone name:Normal zone_idx:2 zone:ffff88603ffd7000 pos:2

kernel: j:0 node:0 zone_type:1 zone name:DMA32		node0's DMA32
			node0's DMA32 goes into node0's pgdat->node_zonelists[0]._zonerefs[2]
kernel:	    j:0 zone name:DMA32 zone_idx:1 zone:ffff88303ffd9800 pos:3

kernel: j:1 node:1 zone_type:1 zone name:DMA32		node1 has no DMA32

kernel: j:0 node:0 zone_type:0 zone name:DMA		node0's DMA
			node0's DMA goes into node0's pgdat->node_zonelists[0]._zonerefs[3]
kernel:	    j:0 zone name:DMA zone_idx:0 zone:ffff88303ffd9000 pos:4

kernel: j:1 node:1 zone_type:0 zone name:DMA		node1 has no DMA

Processing node1:
kernel: build_zonelists node_id:1
node1's fallback nodes:
kernel: node:1
kernel: node:0
kernel: build_zonelists_in_zone_order node id:1

kernel: zone name:DMA addr:ffff88603ffd6000 populate: 0
kernel: zone name:DMA32 addr:ffff88603ffd6800 populate: 0
kernel: zone name:Normal addr:ffff88603ffd7000 populate: 1
kernel: zone name:Movable addr:ffff88603ffd7800 populate: 0

kernel: j:0 node:1 zone_type:3 zone name:Movable	node1's Movable zone (no physical pages behind it)
kernel: j:1 node:0 zone_type:3 zone name:Movable	node0's Movable zone (no physical pages behind it)

kernel: j:0 node:1 zone_type:2 zone name:Normal		node1's Normal
			node1's Normal goes into node1's pgdat->node_zonelists[0]._zonerefs[0]
kernel:	    j:0 zone name:Normal zone_idx:2 zone:ffff88603ffd7000 pos:1

kernel: j:1 node:0 zone_type:2 zone name:Normal		node0's Normal
			node0's Normal goes into node1's pgdat->node_zonelists[0]._zonerefs[1]
kernel:	    j:1 zone name:Normal zone_idx:2 zone:ffff88303ffda000 pos:2

kernel: j:0 node:1 zone_type:1 zone name:DMA32		node1 has no DMA32

kernel: j:1 node:0 zone_type:1 zone name:DMA32		node0's DMA32
			node0's DMA32 goes into node1's pgdat->node_zonelists[0]._zonerefs[2]
kernel:	    j:1 zone name:DMA32 zone_idx:1 zone:ffff88303ffd9800 pos:3
kernel: j:0 node:1 zone_type:0 zone name:DMA		node1 has no DMA

kernel: j:1 node:0 zone_type:0 zone name:DMA		node0's DMA
			node0's DMA goes into node1's pgdat->node_zonelists[0]._zonerefs[3]
kernel:	    j:1 zone name:DMA zone_idx:0 zone:ffff88303ffd9000 pos:4
*/
static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
{
	int pos, j, node;
	int zone_type;		/* needs to be signed */
	struct zone *z;
	struct zonelist *zonelist;

	zonelist = &pgdat->node_zonelists[0];
	pos = 0;
	/*
	 * For each zone type, highest first, add every node_order[] node's
	 * populated zone of that type to the zonelist.
	 */
	for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
		for (j = 0; j < nr_nodes; j++) {
			node = node_order[j];
			z = &NODE_DATA(node)->node_zones[zone_type];
			if (populated_zone(z)) {
				zoneref_set_zone(z, &zonelist->_zonerefs[pos++]);
				check_highest_zone(zone_type);
			}
		}
	}
	zonelist->_zonerefs[pos].zone = NULL;
	zonelist->_zonerefs[pos].zone_idx = 0;
}

/*
 * zone_idx() returns the offset of a zone within its node's node_zones[]
 * array, e.g. 0 for ZONE_DMA.
 */
#ifdef CONFIG_ZONE_DEVICE
#define zone_idx(zone)		(is_dev_zone(zone) ? ZONE_DEVICE : \
				 (zone) - (zone)->zone_pgdat->node_zones)
#else
#define zone_idx(zone)		((zone) - (zone)->zone_pgdat->node_zones)
#endif
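
/*
 * Example: with DMA, DMA32 and Normal configured,
 * zone_idx(&NODE_DATA(0)->node_zones[ZONE_NORMAL]) is plain pointer
 * arithmetic on the node_zones[] array and yields ZONE_NORMAL (2) --
 * exactly the value cached into zoneref->zone_idx below.
 */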


static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
	zoneref->zone = zone;
	zoneref->zone_idx = zone_idx(zone);
}

static void build_thisnode_zonelists(pg_data_t *pgdat)
{
	int j;
	struct zonelist *zonelist;

	zonelist = &pgdat->node_zonelists[1];
	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
	zonelist->_zonerefs[j].zone = NULL;
	zonelist->_zonerefs[j].zone_idx = 0;
}

/*
 * Builds allocation fallback zone lists.
 *
 * Add all populated zones of a node to the zonelist.
 */
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
				int nr_zones, enum zone_type zone_type)
{
	struct zone *zone;

	zone_type++;

	/* This node's own fallback chain, highest zone first: e.g. Normal, DMA32, DMA */
	do {
		zone_type--;
		zone = pgdat->node_zones + zone_type;
		if (populated_zone(zone)) {
			zoneref_set_zone(zone, &zonelist->_zonerefs[nr_zones++]);
			check_highest_zone(zone_type);
		}

	} while (zone_type);

	return nr_zones;
}
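
/*
 * On node 0 of the 2-node machine traced above, this walks Movable,
 * Normal, DMA32 and DMA from highest to lowest, skips the unpopulated
 * Movable zone, fills _zonerefs[0..2] with Normal, DMA32 and DMA, and
 * returns nr_zones == 3 so the caller can write the terminating NULL
 * entry at _zonerefs[3].
 */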


